{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a9ced852-425c-4ee0-952e-4fa1d758eca4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import pandas as pd\n",
    "from scipy.stats import mannwhitneyu\n",
    "from statsmodels.sandbox.stats.multicomp import multipletests\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "\n",
    "sns.set_style('whitegrid')\n",
    "\n",
    "def p_adjust(pvalues, method='fdr_bh'):\n",
    "    res = multipletests(pvalues, method=method)\n",
    "    return np.array(res[1], dtype=float)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "557c8ac7-e08d-48c9-af93-cb67852fb4d0",
   "metadata": {},
   "source": [
    "# Correlating Kraken abundances with continuous vaccine titers\n",
    "\n",
    "##### Michael Shaffer\n",
    "##### 7/21/2022\n",
    "##### Merck ESC, Sys bio group\n",
    "\n",
    "To find if any bacterial taxa are correlated with vaccine response we will pick time points (2, 4, 6, 8 and 12 months) and correlate the bacterial abundances at those timepoints with continuous vaccine response at one year."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4c86528-cf6b-498e-ab8b-83f984395db8",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Read in data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "44af0051-eb8d-4c74-a80e-c07d9bc7259f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SubmissionType</th>\n",
       "      <th>SampleNumber</th>\n",
       "      <th>SampleIDValidation</th>\n",
       "      <th>DiversigenCheckInSampleName</th>\n",
       "      <th>BoxLocation</th>\n",
       "      <th>SampleType</th>\n",
       "      <th>SampleSource</th>\n",
       "      <th>SequencingType</th>\n",
       "      <th>BabyN</th>\n",
       "      <th>BabyN_checked</th>\n",
       "      <th>...</th>\n",
       "      <th>median_mmNorm_PCV</th>\n",
       "      <th>median_mmNorm_DTAPHib</th>\n",
       "      <th>protectNorm_Dip</th>\n",
       "      <th>protectNorm_TET</th>\n",
       "      <th>protectNorm_PRP (Hib)</th>\n",
       "      <th>protectNorm_PT</th>\n",
       "      <th>protectNorm_PRN</th>\n",
       "      <th>protectNorm_FHA</th>\n",
       "      <th>geommean_protectNorm</th>\n",
       "      <th>VR_group_v2</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SampleID</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>106_V2</th>\n",
       "      <td>Primary in Tube</td>\n",
       "      <td>69</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Box 8, F3</td>\n",
       "      <td>Stool</td>\n",
       "      <td>Human Infant</td>\n",
       "      <td>MetaG</td>\n",
       "      <td>106</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>0.052874</td>\n",
       "      <td>2.1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.6</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>1.375</td>\n",
       "      <td>1.140388</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106_V6</th>\n",
       "      <td>Primary in Tube</td>\n",
       "      <td>121</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Box 10, C1</td>\n",
       "      <td>Stool</td>\n",
       "      <td>Human Infant</td>\n",
       "      <td>MetaG</td>\n",
       "      <td>106</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>0.052874</td>\n",
       "      <td>2.1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.6</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>1.375</td>\n",
       "      <td>1.140388</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106_V7</th>\n",
       "      <td>Primary in Tube</td>\n",
       "      <td>158</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Box 11, C3</td>\n",
       "      <td>Stool</td>\n",
       "      <td>Human Infant</td>\n",
       "      <td>MetaG</td>\n",
       "      <td>106</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>0.052874</td>\n",
       "      <td>2.1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.6</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>1.375</td>\n",
       "      <td>1.140388</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106_S_1</th>\n",
       "      <td>Primary in Tube</td>\n",
       "      <td>162</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Box 11, D1</td>\n",
       "      <td>Stool</td>\n",
       "      <td>Human Infant</td>\n",
       "      <td>MetaG</td>\n",
       "      <td>106</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>0.052874</td>\n",
       "      <td>2.1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.6</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>1.375</td>\n",
       "      <td>1.140388</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106_A1</th>\n",
       "      <td>Primary in Tube</td>\n",
       "      <td>188</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Box 12, B3</td>\n",
       "      <td>Stool</td>\n",
       "      <td>Human Infant</td>\n",
       "      <td>MetaG</td>\n",
       "      <td>106</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.061955</td>\n",
       "      <td>0.052874</td>\n",
       "      <td>2.1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.6</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>0.3125</td>\n",
       "      <td>1.375</td>\n",
       "      <td>1.140388</td>\n",
       "      <td>NVR</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 86 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           SubmissionType  SampleNumber  SampleIDValidation  \\\n",
       "SampleID                                                      \n",
       "106_V2    Primary in Tube            69                 NaN   \n",
       "106_V6    Primary in Tube           121                 NaN   \n",
       "106_V7    Primary in Tube           158                 NaN   \n",
       "106_S_1   Primary in Tube           162                 NaN   \n",
       "106_A1    Primary in Tube           188                 NaN   \n",
       "\n",
       "         DiversigenCheckInSampleName BoxLocation SampleType  SampleSource  \\\n",
       "SampleID                                                                    \n",
       "106_V2                           NaN   Box 8, F3      Stool  Human Infant   \n",
       "106_V6                           NaN  Box 10, C1      Stool  Human Infant   \n",
       "106_V7                           NaN  Box 11, C3      Stool  Human Infant   \n",
       "106_S_1                          NaN  Box 11, D1      Stool  Human Infant   \n",
       "106_A1                           NaN  Box 12, B3      Stool  Human Infant   \n",
       "\n",
       "         SequencingType  BabyN  BabyN_checked  ... median_mmNorm_PCV  \\\n",
       "SampleID                                       ...                     \n",
       "106_V2            MetaG    106            NaN  ...          0.061955   \n",
       "106_V6            MetaG    106            NaN  ...          0.061955   \n",
       "106_V7            MetaG    106            NaN  ...          0.061955   \n",
       "106_S_1           MetaG    106            NaN  ...          0.061955   \n",
       "106_A1            MetaG    106            NaN  ...          0.061955   \n",
       "\n",
       "          median_mmNorm_DTAPHib protectNorm_Dip  protectNorm_TET  \\\n",
       "SampleID                                                           \n",
       "106_V2                 0.052874             2.1              3.0   \n",
       "106_V6                 0.052874             2.1              3.0   \n",
       "106_V7                 0.052874             2.1              3.0   \n",
       "106_S_1                0.052874             2.1              3.0   \n",
       "106_A1                 0.052874             2.1              3.0   \n",
       "\n",
       "         protectNorm_PRP (Hib) protectNorm_PT  protectNorm_PRN  \\\n",
       "SampleID                                                         \n",
       "106_V2                     2.6         0.3125           0.3125   \n",
       "106_V6                     2.6         0.3125           0.3125   \n",
       "106_V7                     2.6         0.3125           0.3125   \n",
       "106_S_1                    2.6         0.3125           0.3125   \n",
       "106_A1                     2.6         0.3125           0.3125   \n",
       "\n",
       "         protectNorm_FHA  geommean_protectNorm  VR_group_v2  \n",
       "SampleID                                                     \n",
       "106_V2             1.375              1.140388          NVR  \n",
       "106_V6             1.375              1.140388          NVR  \n",
       "106_V7             1.375              1.140388          NVR  \n",
       "106_S_1            1.375              1.140388          NVR  \n",
       "106_A1             1.375              1.140388          NVR  \n",
       "\n",
       "[5 rows x 86 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "meta = pd.read_csv('../../data/metadata/stool/stool_metadata.csv', index_col='SampleID')\n",
    "meta = pd.concat([meta,\n",
    "                  pd.read_csv('../../data/metadata/stool/stool_abx_usage.csv', index_col='SampleID'),\n",
    "                  pd.read_csv('../../data/metadata/stool/stool_titers_yr1.csv', index_col='SampleID')],\n",
    "                 axis=1)\n",
    "meta['VR_group'] = meta['VR_group'].fillna('Not Measured')\n",
    "meta = meta.sort_values(['BabyN', 'age_at_collection'])\n",
    "meta = meta.loc[~pd.isna(meta['median_mmNorm'])]\n",
    "meta.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f8927715-e07c-416a-a625-563c192b5b9f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(971, 529)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>106_V2</th>\n",
       "      <th>106_V6</th>\n",
       "      <th>106_V7</th>\n",
       "      <th>106_S_1</th>\n",
       "      <th>106_A1</th>\n",
       "      <th>107_V2</th>\n",
       "      <th>107_V3</th>\n",
       "      <th>107_V6</th>\n",
       "      <th>107_S1</th>\n",
       "      <th>107_V7</th>\n",
       "      <th>...</th>\n",
       "      <th>264_S2F</th>\n",
       "      <th>264_V9</th>\n",
       "      <th>264_V10</th>\n",
       "      <th>264_V11</th>\n",
       "      <th>264_V12</th>\n",
       "      <th>265_V2</th>\n",
       "      <th>265_V5</th>\n",
       "      <th>265_V6</th>\n",
       "      <th>265_V8</th>\n",
       "      <th>265_S1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|g__Thermobaculum</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|p__Acidobacteria|c__Acidobacteriia|o__Acidobacteriales|f__Acidobacteriaceae|g__Acidobacterium</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|p__Acidobacteria|c__Acidobacteriia|o__Acidobacteriales|f__Acidobacteriaceae|g__Candidatus_Koribacter</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>17.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|p__Acidobacteria|c__Acidobacteriia|o__Acidobacteriales|f__Acidobacteriaceae|g__Granulicella</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|p__Acidobacteria|c__Acidobacteriia|o__Acidobacteriales|f__Acidobacteriaceae|g__Terriglobus</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 529 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    106_V2  106_V6  106_V7  \\\n",
       "d__Bacteria|g__Thermobaculum                           0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0    17.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0     0.0   \n",
       "\n",
       "                                                    106_S_1  106_A1  107_V2  \\\n",
       "d__Bacteria|g__Thermobaculum                            0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...      0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...      4.0     1.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...      0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...      0.0     0.0     0.0   \n",
       "\n",
       "                                                    107_V3  107_V6  107_S1  \\\n",
       "d__Bacteria|g__Thermobaculum                           0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0     0.0   \n",
       "\n",
       "                                                    107_V7  ...  264_S2F  \\\n",
       "d__Bacteria|g__Thermobaculum                           0.0  ...      0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0  ...      0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0  ...      0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0  ...      0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0  ...      0.0   \n",
       "\n",
       "                                                    264_V9  264_V10  264_V11  \\\n",
       "d__Bacteria|g__Thermobaculum                           0.0      0.0      1.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0      0.0     10.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     1.0      1.0      2.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     5.0      1.0     13.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0      1.0     14.0   \n",
       "\n",
       "                                                    264_V12  265_V2  265_V5  \\\n",
       "d__Bacteria|g__Thermobaculum                            0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...      4.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...      5.0     0.0     2.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...      6.0     0.0     1.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...      3.0     0.0     2.0   \n",
       "\n",
       "                                                    265_V6  265_V8  265_S1  \n",
       "d__Bacteria|g__Thermobaculum                           0.0     0.0     0.0  \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0     0.0  \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     4.0     0.0  \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     8.0     0.0  \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     3.0     0.0  \n",
       "\n",
       "[5 rows x 529 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "kraken_genus_abunds = pd.read_csv('../../data/stool/kraken_taxa_level_abunds/kraken_genus_abunds.tsv', sep='\\t', index_col=0)\n",
    "kraken_genus_abunds = kraken_genus_abunds[meta.query(\"`gt_2.5` == True\").index]\n",
    "print(kraken_genus_abunds.shape)\n",
    "display(kraken_genus_abunds.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ed06601c-88e5-4ba9-a274-63af571183a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(320, 529)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>106_V2</th>\n",
       "      <th>106_V6</th>\n",
       "      <th>106_V7</th>\n",
       "      <th>106_S_1</th>\n",
       "      <th>106_A1</th>\n",
       "      <th>107_V2</th>\n",
       "      <th>107_V3</th>\n",
       "      <th>107_V6</th>\n",
       "      <th>107_S1</th>\n",
       "      <th>107_V7</th>\n",
       "      <th>...</th>\n",
       "      <th>264_S2F</th>\n",
       "      <th>264_V9</th>\n",
       "      <th>264_V10</th>\n",
       "      <th>264_V11</th>\n",
       "      <th>264_V12</th>\n",
       "      <th>265_V2</th>\n",
       "      <th>265_V5</th>\n",
       "      <th>265_V6</th>\n",
       "      <th>265_V8</th>\n",
       "      <th>265_S1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|p__Acidobacteria|c__Acidobacteriia|o__Acidobacteriales|f__Acidobacteriaceae</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>17.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>39.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|p__Acidobacteria|c__Solibacteres|o__Solibacterales|f__Solibacteraceae</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|p__Actinobacteria|c__Acidimicrobiia|o__Acidimicrobiales|f__Acidimicrobiaceae</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Acidothermales|f__Acidothermaceae</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>d__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae</th>\n",
       "      <td>0.0</td>\n",
       "      <td>163.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>283.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>87.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>...</td>\n",
       "      <td>1817.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>33.0</td>\n",
       "      <td>33.0</td>\n",
       "      <td>48.0</td>\n",
       "      <td>98.0</td>\n",
       "      <td>14.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>64.0</td>\n",
       "      <td>174.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 529 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    106_V2  106_V6  106_V7  \\\n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0    17.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Solibacteres|o_...     0.0     0.0    14.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Acidimicrobiia...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...     0.0   163.0     6.0   \n",
       "\n",
       "                                                    106_S_1  106_A1  107_V2  \\\n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...      4.0     1.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Solibacteres|o_...      4.0     2.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Acidimicrobiia...      0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...      0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...      5.0   283.0     0.0   \n",
       "\n",
       "                                                    107_V3  107_V6  107_S1  \\\n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Solibacteres|o_...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Acidimicrobiia...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...     0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...     9.0    87.0    82.0   \n",
       "\n",
       "                                                    107_V7  ...  264_S2F  \\\n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0  ...      0.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Solibacteres|o_...     0.0  ...      0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Acidimicrobiia...     0.0  ...      0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...     0.0  ...      0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...    18.0  ...   1817.0   \n",
       "\n",
       "                                                    264_V9  264_V10  264_V11  \\\n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     6.0      3.0     39.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Solibacteres|o_...     0.0      0.0      0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Acidimicrobiia...     0.0      0.0      0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...     0.0      0.0      0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...    32.0     33.0     33.0   \n",
       "\n",
       "                                                    264_V12  265_V2  265_V5  \\\n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     20.0     0.0     5.0   \n",
       "d__Bacteria|p__Acidobacteria|c__Solibacteres|o_...      0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Acidimicrobiia...      0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...      0.0     0.0     0.0   \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...     48.0    98.0    14.0   \n",
       "\n",
       "                                                    265_V6  265_V8  265_S1  \n",
       "d__Bacteria|p__Acidobacteria|c__Acidobacteriia|...     0.0    15.0     0.0  \n",
       "d__Bacteria|p__Acidobacteria|c__Solibacteres|o_...     0.0     0.0     0.0  \n",
       "d__Bacteria|p__Actinobacteria|c__Acidimicrobiia...     0.0     0.0     0.0  \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...     0.0     0.0     0.0  \n",
       "d__Bacteria|p__Actinobacteria|c__Actinobacteria...    16.0    64.0   174.0  \n",
       "\n",
       "[5 rows x 529 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "kraken_family_abunds = pd.read_csv('../../data/stool/kraken_taxa_level_abunds/kraken_family_abunds.tsv', sep='\\t', index_col=0)\n",
    "kraken_family_abunds = kraken_family_abunds[meta.query(\"`gt_2.5` == True\").index]\n",
    "print(kraken_family_abunds.shape)\n",
    "display(kraken_family_abunds.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0cd4ea1f-112a-4f0b-a80f-1a9563a2c0a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(529, 86)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/6t/1w2t3qmd1rx81mfw9sq_tfpr0000gn/T/ipykernel_12533/1022001642.py:2: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.\n",
      "  meta_matched = meta.loc[in_both]\n"
     ]
    }
   ],
   "source": [
    "in_both = set(meta.index) & set(kraken_genus_abunds.columns)\n",
    "meta_matched = meta.loc[in_both]\n",
    "print(meta_matched.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "af902bea-8307-428d-a875-39eef4b38e25",
   "metadata": {},
   "outputs": [],
   "source": [
    "meta_v5 = meta_matched.query(\"VisitCode == 'V5'\")\n",
    "meta_v6 = meta_matched.query(\"VisitCode == 'V6'\")\n",
    "meta_v7 = meta_matched.query(\"VisitCode == 'V7'\")\n",
    "meta_v9 = meta_matched.query(\"VisitCode == 'V9'\")\n",
    "\n",
    "meta_PCV = meta_matched.loc[~pd.isna(meta_matched['median_mmNorm_PCV'])]\n",
    "meta_PCV_v5 = meta_PCV.query(\"VisitCode == 'V5'\")\n",
    "meta_PCV_v6 = meta_PCV.query(\"VisitCode == 'V6'\")\n",
    "meta_PCV_v7 = meta_PCV.query(\"VisitCode == 'V7'\")\n",
    "meta_PCV_v9 = meta_PCV.query(\"VisitCode == 'V9'\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e0151bb-6764-4c59-99c3-a695d0796ae3",
   "metadata": {},
   "source": [
    "## Genus level association with VRness"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "57169c77-5b4e-41cb-a066-c1847d876425",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>genus</th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Gammaproteoba...</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>288.166667</td>\n",
       "      <td>58.0</td>\n",
       "      <td>0.022971</td>\n",
       "      <td>0.957382</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>d__Bacteria|p__Actinobacteria|c__Coriobacterii...</td>\n",
       "      <td>16.428571</td>\n",
       "      <td>5.750000</td>\n",
       "      <td>171.5</td>\n",
       "      <td>0.053512</td>\n",
       "      <td>0.957382</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...</td>\n",
       "      <td>6704.285714</td>\n",
       "      <td>2241.083333</td>\n",
       "      <td>183.0</td>\n",
       "      <td>0.062108</td>\n",
       "      <td>0.957382</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Gammaproteoba...</td>\n",
       "      <td>40.142857</td>\n",
       "      <td>5.972222</td>\n",
       "      <td>180.5</td>\n",
       "      <td>0.067442</td>\n",
       "      <td>0.957382</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Gammaproteoba...</td>\n",
       "      <td>18.285714</td>\n",
       "      <td>1.555556</td>\n",
       "      <td>171.0</td>\n",
       "      <td>0.078602</td>\n",
       "      <td>0.957382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 genus     LVR_mean  \\\n",
       "115  d__Bacteria|p__Proteobacteria|c__Gammaproteoba...     0.571429   \n",
       "11   d__Bacteria|p__Actinobacteria|c__Coriobacterii...    16.428571   \n",
       "34   d__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...  6704.285714   \n",
       "94   d__Bacteria|p__Proteobacteria|c__Gammaproteoba...    40.142857   \n",
       "109  d__Bacteria|p__Proteobacteria|c__Gammaproteoba...    18.285714   \n",
       "\n",
       "        NVR_mean  statistic   p_value     p_adj  \n",
       "115   288.166667       58.0  0.022971  0.957382  \n",
       "11      5.750000      171.5  0.053512  0.957382  \n",
       "34   2241.083333      183.0  0.062108  0.957382  \n",
       "94      5.972222      180.5  0.067442  0.957382  \n",
       "109     1.555556      171.0  0.078602  0.957382  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kraken_genus_abunds_v5 = kraken_genus_abunds[meta_v5.index]\n",
    "kraken_genus_stats_v5_rows = list()\n",
    "for genus, row in kraken_genus_abunds_v5.iterrows():\n",
    "    lvr_abunds = row[meta_v5.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v5.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        kraken_genus_stats_v5_rows.append([genus, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "kraken_genus_stats_v5 = pd.DataFrame(kraken_genus_stats_v5_rows, columns=['genus', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "kraken_genus_stats_v5['p_adj'] = p_adjust(kraken_genus_stats_v5['p_value'])\n",
    "kraken_genus_stats_v5.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6bbafdb6-6913-469b-befd-1382a657f7ed",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>genus</th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>d__Bacteria|p__Actinobacteria|c__Actinobacteri...</td>\n",
       "      <td>77.833333</td>\n",
       "      <td>1.484848</td>\n",
       "      <td>293.5</td>\n",
       "      <td>0.002944</td>\n",
       "      <td>0.223614</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>d__Bacteria|p__Actinobacteria|c__Coriobacterii...</td>\n",
       "      <td>127.833333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>247.5</td>\n",
       "      <td>0.003636</td>\n",
       "      <td>0.223614</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>d__Bacteria|p__Actinobacteria|c__Coriobacterii...</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>13.909091</td>\n",
       "      <td>286.5</td>\n",
       "      <td>0.011924</td>\n",
       "      <td>0.376696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Alphaproteoba...</td>\n",
       "      <td>10.583333</td>\n",
       "      <td>2.818182</td>\n",
       "      <td>283.0</td>\n",
       "      <td>0.015719</td>\n",
       "      <td>0.376696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>79</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Deltaproteoba...</td>\n",
       "      <td>12.833333</td>\n",
       "      <td>0.151515</td>\n",
       "      <td>260.0</td>\n",
       "      <td>0.017698</td>\n",
       "      <td>0.376696</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                genus    LVR_mean   NVR_mean  \\\n",
       "2   d__Bacteria|p__Actinobacteria|c__Actinobacteri...   77.833333   1.484848   \n",
       "13  d__Bacteria|p__Actinobacteria|c__Coriobacterii...  127.833333   0.000000   \n",
       "17  d__Bacteria|p__Actinobacteria|c__Coriobacterii...   11.000000  13.909091   \n",
       "71  d__Bacteria|p__Proteobacteria|c__Alphaproteoba...   10.583333   2.818182   \n",
       "79  d__Bacteria|p__Proteobacteria|c__Deltaproteoba...   12.833333   0.151515   \n",
       "\n",
       "    statistic   p_value     p_adj  \n",
       "2       293.5  0.002944  0.223614  \n",
       "13      247.5  0.003636  0.223614  \n",
       "17      286.5  0.011924  0.376696  \n",
       "71      283.0  0.015719  0.376696  \n",
       "79      260.0  0.017698  0.376696  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kraken_genus_abunds_v6 = kraken_genus_abunds[meta_v6.index]\n",
    "kraken_genus_stats_v6_rows = list()\n",
    "for genus, row in kraken_genus_abunds_v6.iterrows():\n",
    "    lvr_abunds = row[meta_v6.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v6.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        kraken_genus_stats_v6_rows.append([genus, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "kraken_genus_stats_v6 = pd.DataFrame(kraken_genus_stats_v6_rows, columns=['genus', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "kraken_genus_stats_v6['p_adj'] = p_adjust(kraken_genus_stats_v6['p_value'])\n",
    "kraken_genus_stats_v6.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fbfe8e12-0392-4276-9fdb-d77bb4fbbd09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>genus</th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>d__Bacteria|p__Bacteroidetes|c__Cytophagia|o__...</td>\n",
       "      <td>11.727273</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>323.0</td>\n",
       "      <td>0.001031</td>\n",
       "      <td>0.080917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...</td>\n",
       "      <td>563.272727</td>\n",
       "      <td>0.923077</td>\n",
       "      <td>343.0</td>\n",
       "      <td>0.001250</td>\n",
       "      <td>0.080917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>d__Bacteria|p__Actinobacteria|c__Actinobacteri...</td>\n",
       "      <td>10.818182</td>\n",
       "      <td>2.307692</td>\n",
       "      <td>343.0</td>\n",
       "      <td>0.001853</td>\n",
       "      <td>0.080917</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>64</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...</td>\n",
       "      <td>19.636364</td>\n",
       "      <td>14.051282</td>\n",
       "      <td>321.5</td>\n",
       "      <td>0.009481</td>\n",
       "      <td>0.261978</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...</td>\n",
       "      <td>279.636364</td>\n",
       "      <td>136.282051</td>\n",
       "      <td>324.0</td>\n",
       "      <td>0.010628</td>\n",
       "      <td>0.261978</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                genus    LVR_mean    NVR_mean  \\\n",
       "33  d__Bacteria|p__Bacteroidetes|c__Cytophagia|o__...   11.727273    2.000000   \n",
       "46  d__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...  563.272727    0.923077   \n",
       "5   d__Bacteria|p__Actinobacteria|c__Actinobacteri...   10.818182    2.307692   \n",
       "64  d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...   19.636364   14.051282   \n",
       "49  d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...  279.636364  136.282051   \n",
       "\n",
       "    statistic   p_value     p_adj  \n",
       "33      323.0  0.001031  0.080917  \n",
       "46      343.0  0.001250  0.080917  \n",
       "5       343.0  0.001853  0.080917  \n",
       "64      321.5  0.009481  0.261978  \n",
       "49      324.0  0.010628  0.261978  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kraken_genus_abunds_v7 = kraken_genus_abunds[meta_v7.index]\n",
    "kraken_genus_stats_v7_rows = list()\n",
    "for genus, row in kraken_genus_abunds_v7.iterrows():\n",
    "    lvr_abunds = row[meta_v7.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v7.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        kraken_genus_stats_v7_rows.append([genus, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "kraken_genus_stats_v7 = pd.DataFrame(kraken_genus_stats_v7_rows, columns=['genus', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "kraken_genus_stats_v7['p_adj'] = p_adjust(kraken_genus_stats_v7['p_value'])\n",
    "kraken_genus_stats_v7.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "aca564d7-f291-4a2e-b212-ea974594f62e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>genus</th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Betaproteobac...</td>\n",
       "      <td>5.857143</td>\n",
       "      <td>26.457143</td>\n",
       "      <td>54.5</td>\n",
       "      <td>0.022194</td>\n",
       "      <td>0.669116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Negativicutes|o__...</td>\n",
       "      <td>1624.857143</td>\n",
       "      <td>43.342857</td>\n",
       "      <td>189.5</td>\n",
       "      <td>0.023157</td>\n",
       "      <td>0.669116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Gammaproteoba...</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>44.828571</td>\n",
       "      <td>57.0</td>\n",
       "      <td>0.028170</td>\n",
       "      <td>0.669116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Bacilli|o__Bacill...</td>\n",
       "      <td>37.285714</td>\n",
       "      <td>18.285714</td>\n",
       "      <td>186.5</td>\n",
       "      <td>0.031915</td>\n",
       "      <td>0.669116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>d__Bacteria|p__Actinobacteria|c__Rubrobacteria...</td>\n",
       "      <td>3.857143</td>\n",
       "      <td>21.914286</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.035015</td>\n",
       "      <td>0.669116</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 genus     LVR_mean  \\\n",
       "112  d__Bacteria|p__Proteobacteria|c__Betaproteobac...     5.857143   \n",
       "85   d__Bacteria|p__Firmicutes|c__Negativicutes|o__...  1624.857143   \n",
       "142  d__Bacteria|p__Proteobacteria|c__Gammaproteoba...    10.000000   \n",
       "50   d__Bacteria|p__Firmicutes|c__Bacilli|o__Bacill...    37.285714   \n",
       "21   d__Bacteria|p__Actinobacteria|c__Rubrobacteria...     3.857143   \n",
       "\n",
       "      NVR_mean  statistic   p_value     p_adj  \n",
       "112  26.457143       54.5  0.022194  0.669116  \n",
       "85   43.342857      189.5  0.023157  0.669116  \n",
       "142  44.828571       57.0  0.028170  0.669116  \n",
       "50   18.285714      186.5  0.031915  0.669116  \n",
       "21   21.914286       60.0  0.035015  0.669116  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kraken_genus_abunds_v9 = kraken_genus_abunds[meta_v9.index]\n",
    "kraken_genus_stats_v9_rows = list()\n",
    "for genus, row in kraken_genus_abunds_v9.iterrows():\n",
    "    lvr_abunds = row[meta_v9.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v9.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        kraken_genus_stats_v9_rows.append([genus, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "kraken_genus_stats_v9 = pd.DataFrame(kraken_genus_stats_v9_rows, columns=['genus', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "kraken_genus_stats_v9['p_adj'] = p_adjust(kraken_genus_stats_v9['p_value'])\n",
    "kraken_genus_stats_v9.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a06fb90e-1596-47ba-a582-e516a30f6bf2",
   "metadata": {},
   "source": [
    "Nothing is significant at the genus level."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "04be123e-d2e2-493e-8734-78312dd51d6d",
   "metadata": {},
   "source": [
    "## Family level correlations with median of all titers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b54077c5-5752-4b3b-a86c-0414566048b2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>family</th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>d__Bacteria|p__Bacteroidetes|c__Sphingobacteri...</td>\n",
       "      <td>13.714286</td>\n",
       "      <td>12.666667</td>\n",
       "      <td>184.5</td>\n",
       "      <td>0.053241</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...</td>\n",
       "      <td>249.857143</td>\n",
       "      <td>93.444444</td>\n",
       "      <td>185.0</td>\n",
       "      <td>0.054091</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...</td>\n",
       "      <td>6734.857143</td>\n",
       "      <td>2256.166667</td>\n",
       "      <td>183.0</td>\n",
       "      <td>0.062108</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>71</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Gammaproteoba...</td>\n",
       "      <td>41.714286</td>\n",
       "      <td>116.194444</td>\n",
       "      <td>174.0</td>\n",
       "      <td>0.117916</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>66</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Gammaproteoba...</td>\n",
       "      <td>11.000000</td>\n",
       "      <td>5.333333</td>\n",
       "      <td>166.5</td>\n",
       "      <td>0.121811</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               family     LVR_mean  \\\n",
       "20  d__Bacteria|p__Bacteroidetes|c__Sphingobacteri...    13.714286   \n",
       "36  d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...   249.857143   \n",
       "26  d__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...  6734.857143   \n",
       "71  d__Bacteria|p__Proteobacteria|c__Gammaproteoba...    41.714286   \n",
       "66  d__Bacteria|p__Proteobacteria|c__Gammaproteoba...    11.000000   \n",
       "\n",
       "       NVR_mean  statistic   p_value  p_adj  \n",
       "20    12.666667      184.5  0.053241    1.0  \n",
       "36    93.444444      185.0  0.054091    1.0  \n",
       "26  2256.166667      183.0  0.062108    1.0  \n",
       "71   116.194444      174.0  0.117916    1.0  \n",
       "66     5.333333      166.5  0.121811    1.0  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kraken_family_abunds_v5 = kraken_family_abunds[meta_v5.index]\n",
    "kraken_family_stats_v5_rows = list()\n",
    "for family, row in kraken_family_abunds_v5.iterrows():\n",
    "    lvr_abunds = row[meta_v5.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v5.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        kraken_family_stats_v5_rows.append([family, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "kraken_family_stats_v5 = pd.DataFrame(kraken_family_stats_v5_rows, columns=['family', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "kraken_family_stats_v5['p_adj'] = p_adjust(kraken_family_stats_v5['p_value'])\n",
    "kraken_family_stats_v5.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "85eb8926-3f0b-4ff3-9cfa-df94107d52c1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>family</th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Alphaproteoba...</td>\n",
       "      <td>18.416667</td>\n",
       "      <td>4.303030</td>\n",
       "      <td>293.0</td>\n",
       "      <td>0.012372</td>\n",
       "      <td>0.895142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Negativicutes|o__...</td>\n",
       "      <td>10795.916667</td>\n",
       "      <td>148.272727</td>\n",
       "      <td>270.0</td>\n",
       "      <td>0.052819</td>\n",
       "      <td>0.895142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>d__Bacteria|p__Actinobacteria|c__Actinobacteri...</td>\n",
       "      <td>11.833333</td>\n",
       "      <td>2.424242</td>\n",
       "      <td>267.0</td>\n",
       "      <td>0.068371</td>\n",
       "      <td>0.895142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>d__Bacteria|p__Actinobacteria|c__Actinobacteri...</td>\n",
       "      <td>10.666667</td>\n",
       "      <td>6.909091</td>\n",
       "      <td>268.0</td>\n",
       "      <td>0.071678</td>\n",
       "      <td>0.895142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...</td>\n",
       "      <td>219.416667</td>\n",
       "      <td>137.848485</td>\n",
       "      <td>264.5</td>\n",
       "      <td>0.089834</td>\n",
       "      <td>0.895142</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               family      LVR_mean  \\\n",
       "50  d__Bacteria|p__Proteobacteria|c__Alphaproteoba...     18.416667   \n",
       "44  d__Bacteria|p__Firmicutes|c__Negativicutes|o__...  10795.916667   \n",
       "7   d__Bacteria|p__Actinobacteria|c__Actinobacteri...     11.833333   \n",
       "3   d__Bacteria|p__Actinobacteria|c__Actinobacteri...     10.666667   \n",
       "42  d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...    219.416667   \n",
       "\n",
       "      NVR_mean  statistic   p_value     p_adj  \n",
       "50    4.303030      293.0  0.012372  0.895142  \n",
       "44  148.272727      270.0  0.052819  0.895142  \n",
       "7     2.424242      267.0  0.068371  0.895142  \n",
       "3     6.909091      268.0  0.071678  0.895142  \n",
       "42  137.848485      264.5  0.089834  0.895142  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kraken_family_abunds_v6 = kraken_family_abunds[meta_v6.index]\n",
    "kraken_family_stats_v6_rows = list()\n",
    "for family, row in kraken_family_abunds_v6.iterrows():\n",
    "    lvr_abunds = row[meta_v6.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v6.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        kraken_family_stats_v6_rows.append([family, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "kraken_family_stats_v6 = pd.DataFrame(kraken_family_stats_v6_rows, columns=['family', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "kraken_family_stats_v6['p_adj'] = p_adjust(kraken_family_stats_v6['p_value'])\n",
    "kraken_family_stats_v6.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1e9aadb9-5744-4601-bef2-56c3895824d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>family</th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...</td>\n",
       "      <td>3839.636364</td>\n",
       "      <td>720.487179</td>\n",
       "      <td>325.0</td>\n",
       "      <td>0.009975</td>\n",
       "      <td>0.315375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...</td>\n",
       "      <td>279.636364</td>\n",
       "      <td>136.282051</td>\n",
       "      <td>324.0</td>\n",
       "      <td>0.010628</td>\n",
       "      <td>0.315375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...</td>\n",
       "      <td>3690.454545</td>\n",
       "      <td>2043.564103</td>\n",
       "      <td>316.0</td>\n",
       "      <td>0.017996</td>\n",
       "      <td>0.315375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>d__Bacteria|p__Actinobacteria|c__Actinobacteri...</td>\n",
       "      <td>13.363636</td>\n",
       "      <td>6.794872</td>\n",
       "      <td>314.0</td>\n",
       "      <td>0.018216</td>\n",
       "      <td>0.315375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>d__Bacteria|p__Bacteroidetes|c__Sphingobacteri...</td>\n",
       "      <td>28.272727</td>\n",
       "      <td>13.974359</td>\n",
       "      <td>313.5</td>\n",
       "      <td>0.020849</td>\n",
       "      <td>0.315375</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               family     LVR_mean  \\\n",
       "44  d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...  3839.636364   \n",
       "37  d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...   279.636364   \n",
       "39  d__Bacteria|p__Firmicutes|c__Clostridia|o__Clo...  3690.454545   \n",
       "3   d__Bacteria|p__Actinobacteria|c__Actinobacteri...    13.363636   \n",
       "25  d__Bacteria|p__Bacteroidetes|c__Sphingobacteri...    28.272727   \n",
       "\n",
       "       NVR_mean  statistic   p_value     p_adj  \n",
       "44   720.487179      325.0  0.009975  0.315375  \n",
       "37   136.282051      324.0  0.010628  0.315375  \n",
       "39  2043.564103      316.0  0.017996  0.315375  \n",
       "3      6.794872      314.0  0.018216  0.315375  \n",
       "25    13.974359      313.5  0.020849  0.315375  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kraken_family_abunds_v7 = kraken_family_abunds[meta_v7.index]\n",
    "kraken_family_stats_v7_rows = list()\n",
    "for family, row in kraken_family_abunds_v7.iterrows():\n",
    "    lvr_abunds = row[meta_v7.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v7.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        kraken_family_stats_v7_rows.append([family, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "kraken_family_stats_v7 = pd.DataFrame(kraken_family_stats_v7_rows, columns=['family', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "kraken_family_stats_v7['p_adj'] = p_adjust(kraken_family_stats_v7['p_value'])\n",
    "kraken_family_stats_v7.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f11775af-f323-441f-a848-9dc3e661c8ae",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>family</th>\n",
       "      <th>LVR_mean</th>\n",
       "      <th>NVR_mean</th>\n",
       "      <th>statistic</th>\n",
       "      <th>p_value</th>\n",
       "      <th>p_adj</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>79</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Betaproteobac...</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>89.057143</td>\n",
       "      <td>39.5</td>\n",
       "      <td>0.005348</td>\n",
       "      <td>0.394218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Alphaproteoba...</td>\n",
       "      <td>5.857143</td>\n",
       "      <td>19.828571</td>\n",
       "      <td>51.0</td>\n",
       "      <td>0.016463</td>\n",
       "      <td>0.394218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Gammaproteoba...</td>\n",
       "      <td>9.285714</td>\n",
       "      <td>42.857143</td>\n",
       "      <td>52.0</td>\n",
       "      <td>0.018101</td>\n",
       "      <td>0.394218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>d__Bacteria|p__Bacteroidetes|o__Bacteroidetes_...</td>\n",
       "      <td>4.857143</td>\n",
       "      <td>31.371429</td>\n",
       "      <td>57.5</td>\n",
       "      <td>0.028867</td>\n",
       "      <td>0.394218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>84</th>\n",
       "      <td>d__Bacteria|p__Proteobacteria|c__Betaproteobac...</td>\n",
       "      <td>11.571429</td>\n",
       "      <td>31.971429</td>\n",
       "      <td>57.5</td>\n",
       "      <td>0.029385</td>\n",
       "      <td>0.394218</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                family   LVR_mean   NVR_mean  \\\n",
       "79   d__Bacteria|p__Proteobacteria|c__Betaproteobac...  20.000000  89.057143   \n",
       "75   d__Bacteria|p__Proteobacteria|c__Alphaproteoba...   5.857143  19.828571   \n",
       "109  d__Bacteria|p__Proteobacteria|c__Gammaproteoba...   9.285714  42.857143   \n",
       "34   d__Bacteria|p__Bacteroidetes|o__Bacteroidetes_...   4.857143  31.371429   \n",
       "84   d__Bacteria|p__Proteobacteria|c__Betaproteobac...  11.571429  31.971429   \n",
       "\n",
       "     statistic   p_value     p_adj  \n",
       "79        39.5  0.005348  0.394218  \n",
       "75        51.0  0.016463  0.394218  \n",
       "109       52.0  0.018101  0.394218  \n",
       "34        57.5  0.028867  0.394218  \n",
       "84        57.5  0.029385  0.394218  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kraken_family_abunds_v9 = kraken_family_abunds[meta_v9.index]\n",
    "kraken_family_stats_v9_rows = list()\n",
    "for family, row in kraken_family_abunds_v9.iterrows():\n",
    "    lvr_abunds = row[meta_v9.query('VR_group == \"LVR\"').index]\n",
    "    nvr_abunds = row[meta_v9.query('VR_group == \"NVR\"').index]\n",
    "    # check for not all zeros\n",
    "    # lvr_gt_20 = (lvr_abunds != 0).sum()/len(lvr_abunds) > .2\n",
    "    # nvr_gt_20 = (nvr_abunds != 0).sum()/len(nvr_abunds) > .2\n",
    "    lvr_gt_20 = lvr_abunds.sum()/len(lvr_abunds) > 10\n",
    "    nvr_gt_20 = nvr_abunds.sum()/len(nvr_abunds) > 10\n",
    "    if lvr_gt_20 or nvr_gt_20:\n",
    "        stat, p_value = mannwhitneyu(lvr_abunds, nvr_abunds)\n",
    "        kraken_family_stats_v9_rows.append([family, lvr_abunds.mean(), nvr_abunds.mean(), stat, p_value])\n",
    "kraken_family_stats_v9 = pd.DataFrame(kraken_family_stats_v9_rows, columns=['family', 'LVR_mean', 'NVR_mean', 'statistic', 'p_value']).sort_values('p_value')\n",
    "kraken_family_stats_v9['p_adj'] = p_adjust(kraken_family_stats_v9['p_value'])\n",
    "kraken_family_stats_v9.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e165fab2-79e7-490f-ba56-a45d6e0ef8f1",
   "metadata": {},
   "source": [
    "None of the individual genuses that are within the most significant family have compelling results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dd36d49-5dab-413c-ad91-4395dba7638d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
